##############################################################################################
# Author: Subhendu Malakar
# Mentor: Mridul Sankar Barik
# About: This code inserts/updates the database with new information for securityfocus.com website.
# Working: This code crawls the website and extracts information from
#		   the html page to feed into the database. This information is
#		   stored in the database in a proper format. There are 3 options:
#			1. Full Crawl : This crawls all the web pages.
#			2. Range Crawl : This crawls only the range of pages specified by the user.
#			3. Sync Crawl : This crawls new pages which are not present in the database.
##############################################################################################

# Importing HTMLParser module for parsing the html page downloaded.
from html.parser import HTMLParser

# urllib.request module contains functions for downloading web pages.
import urllib.request

# urllib.error module contains functions for exception handling.
import urllib.error

# Importing psycopg2 module to connect with the PostgreSQL database.
import psycopg2

# For exit()
import sys

# For string manipulation functions.
import string

# For multithreading.
import threading

# For counting the number of cpus in the machine.
import multiprocessing

# For timeout.
import time

# Upper bound on the securityfocus.com page ids to crawl.
# full_crawl() and sync_crawl() read this global; the crawl loops run while
# the page id is strictly less than this value, so the bound is exclusive.
end=24

# Class name: MyHTMLParser
# Inherits: HTMLParser
# Functions: __init__(self,strict=False)
#			handle_starttag(self,tag,attrs)
#			handle_endtag(self,tag)
#			handle_data(self,data)
#			get_cve_id(self)
#			get_products(self)
#			get_attack_from(self)
# Description: This class is responsible for extracting data from an HTML page.
#			handle_starttag() is invoked automatically when a starting tag is encountered.
#			handle_endtag() is invoked automatically when an ending tag is encountered.
#			handle_data() is called when text data is encountered.
#			Parses the CVE id, the affected products and the attack origin (remote/local).
class MyHTMLParser(HTMLParser):
	"""Collect the CVE id, attack origin and vulnerable products from an 'info' page.

	The parser looks for label texts ('CVE:', 'Remote:', 'Local:',
	'Vulnerable:') inside table rows and reads the text that follows each
	label. Results are exposed through get_cve_id(), get_products() and
	get_attack_from().
	"""

	def __init__(self,strict=False):
		"""Create an empty parser.

		strict is accepted only so that existing callers keep working. It is
		ignored: HTMLParser dropped its ``strict`` argument in Python 3.5 and
		forwarding it positionally raises TypeError.
		"""
		HTMLParser.__init__(self)

		# Extracted data.
		self.cve_id=''
		self.products=[]
		self.attack_from=''

		# "The next text chunk belongs to this field" flags.
		self.cve=False
		self.prod=False
		self.attk=False
		self.rmt=False
		self.lcl=False

		# True while the parser is inside a <tr> element.
		self.tr=False

	def handle_starttag(self,tag,attrs):
		"""Mark the start of a table row; a new row ends the product list."""
		if tag == 'tr':
			self.tr=True
			self.prod=False

	def handle_endtag(self,tag):
		"""Mark the end of a table row; the CVE value never spans rows."""
		if tag == 'tr':
			self.tr=False
			self.cve=False

	def handle_data(self,data):
		"""Consume one text chunk: store it if a label armed a flag, then look for labels."""
		data=data.strip()

		# Whitespace-only chunks carry no information and must not consume a flag.
		if not data:
			return

		if self.cve:
			# Keep everything after the 'CVE-' prefix. A fixed 9-character
			# slice truncates ids whose sequence number has more than four
			# digits (e.g. CVE-2014-123456). Fall back to the historical
			# slice for text that does not carry the prefix.
			if data.startswith('CVE-'):
				self.cve_id=data[4:]
			else:
				self.cve_id=data[-9:]
		if self.rmt:
			if 'Yes' in data:
				self.attack_from='Remote '
			# Always disarm after the value has been read; otherwise a 'No'
			# leaves the flag set and a later unrelated 'Yes' is taken as
			# the answer to 'Remote:'.
			self.rmt=False
		if self.lcl:
			if 'Yes' in data:
				self.attack_from+='Local'
			self.lcl=False
		if self.prod:
			self.products.append(data)

		# Labels are only recognised inside a table row. The matching flag is
		# set here so that the NEXT text chunk is stored as the value.
		if self.tr:
			if data == 'CVE:':
				self.cve=True
			if data == 'Remote:':
				self.rmt = True
			if data == 'Local:':
				self.lcl = True
			if data == 'Vulnerable:':
				self.prod=True

	def get_cve_id(self):
		"""Return the CVE id without the 'CVE-' prefix, or '' if none was found."""
		return self.cve_id

	def get_products(self):
		"""Return the list of affected product names."""
		return self.products

	def get_attack_from(self):
		"""Return 'Remote ', 'Local', 'Remote Local' or '' depending on the page."""
		return self.attack_from

# Class name: Parse_discuss
# Inherits: HTMLParser
# Functions: __init__(self,strict=False)
#			handle_starttag(self,tag,attrs)
#			handle_endtag(self,tag)
#			handle_data(self,data)
#			get_description(self)
# Description: This class is responsible for extracting data from an HTML page.
#			handle_starttag() is invoked automatically when a starting tag is encountered.
#			handle_endtag() is invoked automatically when an ending tag is encountered.
#			handle_data() is called when text data is encountered.
#			Parses the description.
class Parse_discuss(HTMLParser):
	"""Collect the description text from a 'discuss' page.

	Text inside <div id="vulnerability"> is accumulated, except for text that
	sits inside a <span> element.
	"""

	def __init__(self,strict=False):
		"""Create an empty parser.

		strict is accepted only for backward compatibility and is ignored:
		HTMLParser dropped its ``strict`` argument in Python 3.5 and
		forwarding it positionally raises TypeError.
		"""
		HTMLParser.__init__(self)

		# Accumulated description; every chunk is followed by one space.
		self.description=''

		# True while inside the <div id="vulnerability"> element.
		self.desc=False

		# True while inside a <span>; such text is skipped.
		self.span=False

	def handle_starttag(self,tag,attrs):
		"""Start collecting at <div id="vulnerability">; note when a <span> opens."""
		if tag == 'div':
			for name,value in attrs:
				if name == 'id' and value == 'vulnerability':
					self.desc=True
		if tag == 'span':
			self.span=True

	def handle_endtag(self,tag):
		"""Stop collecting at any </div>; note when a <span> closes."""
		if tag == 'div':
			self.desc=False
		if tag == 'span':
			self.span=False

	def handle_data(self,data):
		"""Append the stripped text chunk when inside the target div and outside a span."""
		if self.desc and not self.span:
			self.description+=(data.strip() + ' ')

	def get_description(self):
		"""Return the accumulated description ('' if the div was not found)."""
		return self.description
# Class: Parse_impact
# Inherits: HTMLParser
# Description: Parses the impact of the vulnerability.
class Parse_impact(HTMLParser):
	"""Collect the impact text from an 'exploit' page.

	Text inside <div id="vulnerability"> is accumulated, except for text
	inside a <span>, chunks mentioning 'securityfocus.com' and chunks of
	1000 characters or more.
	"""

	def __init__(self,strict=False):
		"""Create an empty parser.

		strict is accepted only for backward compatibility and is ignored:
		HTMLParser dropped its ``strict`` argument in Python 3.5 and
		forwarding it positionally raises TypeError.
		"""
		HTMLParser.__init__(self)

		# Accumulated impact text; every chunk is followed by one space.
		self.impact=''

		# True while inside the <div id="vulnerability"> element.
		self.impt=False

		# True while inside a <span>; such text is skipped.
		self.span=False

	def handle_starttag(self,tag,attrs):
		"""Start collecting at <div id="vulnerability">; note when a <span> opens."""
		if tag == 'div':
			for name,value in attrs:
				if name == 'id' and value == 'vulnerability':
					self.impt=True
		if tag == 'span':
			self.span=True

	def handle_endtag(self,tag):
		"""Stop collecting at any </div>; note when a <span> closes."""
		if tag == 'div':
			self.impt=False
		if tag == 'span':
			self.span=False

	def handle_data(self,data):
		"""Append the stripped chunk unless it is filtered out (see class docstring)."""
		if not self.impt or self.span:
			return
		if 'securityfocus.com' in data:
			return
		if len(data)<1000:
			self.impact+=(data.strip() + ' ')

	def get_impact(self):
		"""Return the accumulated impact text ('' if the div was not found)."""
		return self.impact

# class: Parse_solution
# inherits: HTMLParser
# Description: Parses any possible solution.
class Parse_solution(HTMLParser):
	"""Collect the solution text from a 'solution' page.

	Text inside <div id="vulnerability"> is accumulated, except for text
	inside a <span> and the literal heading 'Solution:'.
	"""

	def __init__(self,strict=False):
		"""Create an empty parser.

		strict is accepted only for backward compatibility and is ignored:
		HTMLParser dropped its ``strict`` argument in Python 3.5 and
		forwarding it positionally raises TypeError.
		"""
		HTMLParser.__init__(self)

		# Accumulated solution text; every chunk is followed by one space.
		self.solution=''

		# True while inside the <div id="vulnerability"> element.
		self.sol=False

		# True while inside a <span>; such text is skipped.
		self.span=False

	def handle_starttag(self,tag,attrs):
		"""Start collecting at <div id="vulnerability">; note when a <span> opens."""
		if tag == 'div':
			for name,value in attrs:
				if name == 'id' and value == 'vulnerability':
					self.sol=True
		if tag == 'span':
			self.span=True

	def handle_endtag(self,tag):
		"""Stop collecting at any </div>; note when a <span> closes."""
		if tag == 'div':
			self.sol=False
		if tag == 'span':
			self.span=False

	def handle_data(self,data):
		"""Append the stripped chunk, skipping the 'Solution:' heading itself."""
		if self.sol and not self.span:
			if data == 'Solution:':
				return
			self.solution+=(data.strip() + ' ')

	def get_solution(self):
		"""Return the accumulated solution text ('' if the div was not found)."""
		return self.solution

# Class: Parse_references
# Inherits: HTMLParser
# Description: Parses references for further information.
class Parse_references(HTMLParser):
	"""Collect the link targets found inside <div id="vulnerability"> of a 'references' page."""

	def __init__(self,strict=False):
		"""Create an empty parser.

		strict is accepted only for backward compatibility and is ignored:
		HTMLParser dropped its ``strict`` argument in Python 3.5 and
		forwarding it positionally raises TypeError.
		"""
		HTMLParser.__init__(self)

		# href values collected so far, in document order.
		self.references=[]

		# True while inside the <div id="vulnerability"> element.
		self.ref=False

		# True while inside a <span>.
		self.span=False

		# Kept for compatibility with code that may read it; not used here.
		self.a=False

	def handle_starttag(self,tag,attrs):
		"""Track the target div and spans; record the href of every <a> inside the div."""
		if tag == 'div':
			for name,value in attrs:
				if name == 'id' and value == 'vulnerability':
					self.ref=True
		if tag == 'span':
			self.span=True
		if tag == 'a' and self.ref:
			for name,value in attrs:
				# A bare 'href' attribute has value None; skipping it keeps the
				# list joinable with ','.join() later on.
				if name =='href' and value:
					self.references.append(value)

	def handle_endtag(self,tag):
		"""Stop collecting at any </div>; note when a <span> closes."""
		if tag == 'div':
			self.ref=False
		if tag == 'span':
			self.span=False

	def get_references(self):
		"""Return the list of collected reference URLs."""
		return self.references

# Class : Parser
# Description: It is a wrapper of all the above parsers.
class Parser():
	"""Download the five sub-pages of one vulnerability entry and parse each of them.

	url is the entry's base URL ending in '/'. The sub-page names 'info',
	'discuss', 'exploit', 'solution' and 'references' are appended to it.
	Any download or decoding error raised by urllib propagates to the caller.
	"""

	def __init__(self,url):
		"""Fetch and parse all sub-pages of the entry at url."""
		self.parse = self._parse_page(url+'info',MyHTMLParser())
		self.parse_desc = self._parse_page(url+'discuss',Parse_discuss())
		self.parse_impt = self._parse_page(url+'exploit',Parse_impact())
		self.parse_sol = self._parse_page(url+'solution',Parse_solution())
		self.parse_ref = self._parse_page(url+'references',Parse_references())

	@staticmethod
	def _parse_page(page_url,parser):
		"""Download page_url, feed its HTML to parser and return the parser."""
		request = urllib.request.Request(page_url)
		request.add_header("User-Agent", "My Crawler")
		opener = urllib.request.build_opener()

		# The with-block closes the HTTP response even if read() fails.
		with opener.open(request) as response:
			# errors='replace' keeps one stray byte from discarding a whole page.
			html = response.read().decode('utf-8',errors='replace')
		parser.feed(html)

		# close() flushes any text still buffered after the last tag.
		parser.close()
		return parser

	def get_cve_id(self):
		"""Return the CVE id found on the 'info' page."""
		return self.parse.get_cve_id()

	def get_description(self):
		"""Return the description text from the 'discuss' page."""
		return self.parse_desc.get_description()

	def get_impact(self):
		"""Return the impact text from the 'exploit' page."""
		return self.parse_impt.get_impact()

	def get_solutions(self):
		"""Return the solution text from the 'solution' page."""
		return self.parse_sol.get_solution()

	def get_references(self):
		"""Return the reference URLs as one comma-separated string."""
		return ','.join(self.parse_ref.get_references())

	def get_products(self):
		"""Return the affected products as one comma-separated string."""
		return ','.join(self.parse.get_products())

	def get_attack_from(self):
		"""Return the attack origin string from the 'info' page."""
		return self.parse.get_attack_from()

# Class: MyThread
# Inherits: threading.Thread
# Functions: __init__(self,bitmap,strt,end)
#			run(self)
#			common_function(self,conn,bitmap,strt,end)
# Description: This class enables the use of threads for faster execution.
#			Each object is given its own range of pages and crawls it.
class MyThread (threading.Thread):
	"""Worker thread that crawls the page ids in [strt, end) and stores them in the database."""

	def __init__(self,bitmap,strt,end):
		"""Remember the work description.

		bitmap -- indexable by page id; a true entry means "skip this page".
		strt   -- first page id to crawl.
		end    -- page id at which to stop (exclusive).
		"""
		threading.Thread.__init__(self)
		self.bitmap=bitmap
		self.strt=strt
		self.end=end

	def run(self):
		"""Thread body: open a private database connection and crawl the assigned range."""
		try:
			# Each thread uses its own connection.
			# NOTE(review): the credentials are hard-coded here; consider moving them to configuration.
			conn=psycopg2.connect("dbname='postgres' user='postgres' host='localhost' port='5432' password='postgres'")
		except Exception as e:
			print('Database connection error.')
			print(e)
			# There is no connection to work with or to close.
			return

		try:
			self.common_function(conn,self.bitmap,self.strt,self.end)
		finally:
			# Release the connection even if the crawl aborts.
			conn.close()

	def common_function(self,conn,bitmap,strt=1,end=100000):
		"""Download every page id in [strt, end) whose bitmap entry is false and store it.

		A page that cannot be downloaded or parsed is reported and skipped.
		The cursor is closed when the loop ends, normally or through an exception.
		"""
		# Common url for all the pages.
		url_head='http://www.securityfocus.com/bid/'
		cur=conn.cursor()
		try:
			for i in range(strt,end):
				# Already in the database: nothing to do.
				if bitmap[i]:
					continue

				url=url_head+str(i)+'/'
				print('Fetching the url: ',url)
				try:
					parse=Parser(url)
				# HTTPError is a subclass of URLError, so it has to be tested first.
				except urllib.error.HTTPError:
					print('HTTP Error for: ',url)
					continue
				except urllib.error.URLError:
					print('URL Error for: ',url)
					continue
				except Exception:
					print('unknown error in parsing file.')
					continue
				print('Parsing Complete.')
				self._store_page(conn,cur,i,parse)
		finally:
			cur.close()

	def _store_page(self,conn,cur,page_id,parse):
		"""Insert the parsed page, or update the existing row with the same CVE id."""
		cve_id=parse.get_cve_id()

		# A page without a CVE id yields an empty string.
		if not cve_id.strip():
			print('ERROR:\tNo CVE-ID found in the Page')
			return

		try:
			cur.execute("SELECT * FROM securityfocus WHERE cve_id=%s",(cve_id,))
			if cur.fetchone() is None:
				cur.execute("INSERT INTO securityfocus (cve_id,securityfocus_id,description,attack_from,impact,products,solutions,links) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)",
				(cve_id,page_id,parse.get_description(),parse.get_attack_from(),parse.get_impact(),parse.get_products(),parse.get_solutions(),parse.get_references()))
				print('Inserted in Database: ',cve_id)
			else:
				print('ERROR: ',cve_id,' already present in the Database.\n Updating...')
				cur.execute("UPDATE securityfocus SET description=%s,attack_from=%s,impact=%s,products=%s,solutions=%s,links=%s WHERE cve_id=%s",(parse.get_description(),parse.get_attack_from(),parse.get_impact(),parse.get_products(),parse.get_solutions(),parse.get_references(),cve_id))
			conn.commit()
		except Exception as e:
			print('ERROR: Error in Inserting data into Database. \n Error Details: ',end='\t')
			print(e)
			# A failed statement leaves the transaction aborted; without a
			# rollback every later statement on this connection fails too.
			try:
				conn.rollback()
			except Exception as rollback_error:
				print('ERROR: rollback failed: ',rollback_error)

# Global function to set parameters for full crawling of websites.	
def full_crawl():
	"""Return the (bitmap, start, end) triple for crawling every page.

	The bitmap holds one False per index below the module-level ``end``, so
	no page is treated as already crawled. The start position is 1.
	"""
	# ``end`` is only read here, so the module-level value is used directly.
	first_page=1
	nothing_crawled=[False for _ in range(end)]
	return nothing_crawled,first_page,end

# Global function to set parameters for crawling web pages in a user given range.
def range_crawl():
	"""Ask the user for a start and a last position and return (bitmap, start, last).

	The bitmap holds one False per index below the entered last position, so
	no page in the range is treated as already crawled.
	"""
	# Both prompts are answered before the bitmap is built.
	first=int(input('Enter the Start Position: '))
	last=int(input('Enter the Last Position: '))
	return ([False for _ in range(last)],first,last)

# Global function to set parameters for crawling web pages which are recently added, i.e, pages added to the website after the last full/sync crawl.
def sync_crawl(conn):
	"""Return (bitmap, 1, end) where bitmap[i] is True for pages already stored.

	conn is an open database connection. For every id from 1 up to, but not
	including, the module-level ``end`` the securityfocus table is queried by
	securityfocus_id. Index 0 is never queried and stays False.
	"""
	# ``end`` is only read here, so the module-level value is used directly.
	bitmap=[False]*end

	cur=conn.cursor()
	try:
		for i in range(1,end):
			cur.execute("SELECT * FROM securityfocus WHERE securityfocus_id=%s",(i,))

			# fetchone() returns None when no row matched: the page still has
			# to be crawled. Any row means the page is already stored.
			bitmap[i]=cur.fetchone() is not None
	finally:
		# Close the cursor even when a query fails.
		cur.close()
	return (bitmap,1,end)
	
# Global function to start the execution.
def _split_range(strt,end,parts):
	"""Split [strt, end) into at most parts contiguous, non-empty (low, high) ranges.

	The ranges cover every id exactly once; the first ranges are one longer
	when the total is not divisible by parts. Returns [] for an empty range.
	"""
	total=end-strt
	if total<=0:
		return []
	parts=max(1,min(parts,total))
	base,extra=divmod(total,parts)
	ranges=[]
	low=strt
	for k in range(parts):
		high=low+base+(1 if k<extra else 0)
		ranges.append((low,high))
		low=high
	return ranges

def main():
	"""Ask for the crawl mode, build the work description and run one thread per CPU.

	Returns 0, also when the database connection cannot be opened. Exits
	with status 1 when the menu choice is not 1, 2 or 3.
	"""
	try:
		conn=psycopg2.connect("dbname='postgres' user='postgres' host='localhost' port='5432' password='postgres'")
	except Exception:
		print('Database connection error.');
		return 0

	try:
		# A non-numeric answer is treated like any other invalid choice.
		try:
			inp=int(input('1. Full Crawl\n2. Range Crawl\n3. Sync Crawl\nEnter your choice(1/2/3):'))
		except ValueError:
			inp=None

		if inp==1:
			bitmap,strt,end=full_crawl()
		elif inp==2:
			bitmap,strt,end=range_crawl()
		elif inp==3:
			bitmap,strt,end=sync_crawl(conn)
		else:
			print('Wrong Input\nExiting...')
			sys.exit(1)
	finally:
		# Only sync_crawl() uses this connection; the worker threads open
		# their own. Close it on every path, including sys.exit().
		conn.close()

	# One thread per CPU at most. Integer division alone would silently drop
	# the last (end-strt) % cpu_count pages, or all of them when there are
	# more CPUs than pages, so the remainder is spread over the first ranges.
	threads=[MyThread(bitmap,low,high) for low,high in _split_range(strt,end,multiprocessing.cpu_count())]

	# start() invokes MyThread.run() in a new thread.
	for worker in threads:
		worker.start()

	# Wait for all workers so that main() returns only when the crawl is done.
	for worker in threads:
		worker.join()
	return 0

# Run the crawler only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
	main()

